CSS Workshop

Some tastes on Computational Social Sciences

KT Wong

Faculty of Social Sciences, HKU

2025-06-26

Monty Hall Game

Some background about Monty Hall Game

  • Based on the American television game show Let’s Make a Deal and its host, named Monty Hall

  • The game involves three doors

    • behind one of which is a car (the prize)
    • behind the other two are goats (not the prize)
  • You pick a door — say, door 1

  • Now, the host, who knows what’s behind the doors, opens one of the other doors — say, door 2 — which reveals a goat

  • The host then asks you:

    • Do you want to stay with door 1, or
    • would you like to switch to door 3?
  • What should you do?

Use a computational method to simulate the Monty Hall game

Code
# Simulate the Monty Hall game where the player always stays with the
# first pick; the theoretical win probability is 1/3.
doors <- c("Door 1", "Door 2", "Door 3")

win_count <- 0

n <- 10000  # Number of simulated games

for (i in seq_len(n)) {
  # Randomly place the car behind one of the doors
  # (size-1 draws need no `replace` argument)
  car_door <- sample(doors, 1)

  # Player randomly chooses a door
  player_choice <- sample(doors, 1)

  # The player wins by staying iff the first pick already hides the car
  if (player_choice == car_door) {
    win_count <- win_count + 1
  }
}

print(paste("the Probability of Winning by NOT switching:", win_count/n))
[1] "the Probability of Winning by NOT switching: 0.326"

The scenario of switching doors

Code
# Simulate the switching strategy; theoretical win probability is 2/3.
# Reuses `doors` and `n` defined in the staying simulation above.
win_count_switch <- 0

for (i in seq_len(n)) {
  # Randomly place the car behind one of the doors
  car_door <- sample(doors, 1)

  # Player randomly chooses a door
  player_choice <- sample(doors, 1)

  # Host opens a door that is neither the player's choice nor the car.
  # `remaining_doors` is a character vector, so sample() is safe even when
  # only one door remains (the 1:x shortcut applies to numeric input only).
  remaining_doors <- setdiff(doors, c(player_choice, car_door))
  host_opens <- sample(remaining_doors, 1)

  # The player switches to the single remaining closed door
  switch_choice <- setdiff(doors, c(player_choice, host_opens))

  # Count a win when switching lands on the car
  if (switch_choice == car_door) {
    win_count_switch <- win_count_switch + 1
  }
}

print(paste("the Probability of Winning by switching:", win_count_switch/n))
[1] "the Probability of Winning by switching: 0.6701"

Write a function to simulate the Monty Hall game

Code
#' Play one round of the Monty Hall game.
#'
#' @param d Number of doors (default 3); must be at least 3.
#' @param switch Should the player switch after the host opens a door?
#' @return TRUE if the player's final choice hides the prize, FALSE otherwise.
monty_hall <- function(d = 3, switch = FALSE) {
  # sample() treats a length-one numeric vector x as 1:x, so guard every
  # draw from a candidate set that may shrink to a single door
  pick_one <- function(x) if (length(x) > 1) sample(x, 1) else x

  prize <- sample(1:d, 1)
  choice <- sample(1:d, 1)

  # Host opens one door that is neither the player's choice nor the prize
  host_choices <- setdiff(1:d, c(choice, prize))
  host_open <- pick_one(host_choices)

  # A switching player picks uniformly among the still-closed other doors
  final_choice <- if (switch) {
    pick_one(setdiff(1:d, c(choice, host_open)))
  } else {
    choice
  }

  final_choice == prize
}

simulate and plot the probability of winning by switching

  • Simulate the Monty Hall game for different numbers of rounds
Code
set.seed(123)
rounds <- c(10, 20, 30, 50, 100, 200, 500, 1000, 2000, 5000, 10000, 20000, 50000)

# Estimated win probability for each number of rounds; vapply() pins the
# result to exactly one double per element (safer than sapply(), whose
# return type depends on its input)
prob_switch <- vapply(rounds, function(r) {
  mean(replicate(r, monty_hall(switch = TRUE)))
}, numeric(1))

prob_stay <- vapply(rounds, function(r) {
  mean(replicate(r, monty_hall(switch = FALSE)))
}, numeric(1))

# Long-format data frame for ggplot: one row per (rounds, strategy) pair
df <- data.frame(
  rounds = rep(rounds, 2),
  probability = c(prob_switch, prob_stay),
  strategy = rep(c("Switch", "Stay"), each = length(rounds))
)
  • Plot the probabilities of winning by switching and staying
Code
library(ggplot2)

# Convergence of the simulated win probabilities toward the theoretical
# 2/3 and 1/3 values.  annotate() is used for the two reference labels:
# a geom_text() with a constant label inside aes() would redraw the same
# text once per data row, producing blurry overplotted labels.
ggplot(df, 
       aes(x = rounds, y = probability, color = strategy)) +
  geom_line() +
  geom_point() +
  geom_hline(yintercept = 2/3, linetype = "dashed", color = "blue") +
  geom_hline(yintercept = 1/3, linetype = "dashed", color = "red") +
  annotate("text", x = max(rounds), y = 2/3, label = "2/3",
           hjust = -0.2, vjust = -0.5, color = "blue") +  # Annotate 2/3
  annotate("text", x = max(rounds), y = 1/3, label = "1/3",
           hjust = -0.2, vjust = -0.5, color = "red") +   # Annotate 1/3
  scale_x_log10(breaks = rounds, labels = rounds) +
  scale_color_discrete(breaks = c("Stay", "Switch")) +  # Reorder legend labels
  labs(title = "Convergence of Monty Hall Probabilities",
       subtitle = "Dashed lines: Theoretical values (Switch=2/3, Stay=1/3)",
       x = "Number of Rounds (log scale)", y = "Probability of Winning",
       color = "Strategy Choice") +
  theme_minimal() +
  coord_cartesian(xlim = c(min(rounds), max(rounds) * 1.2))

Extension 1: Number of doors

  • Let us see how the probability of winning by switching changes as the number of doors increases
Code
# Number of doors to sweep over.
# NOTE: this reuses (and overwrites) the `doors` name that earlier held
# the character door labels.
doors <- 3:50
sim_per_door <- 10000

# vapply() instead of sapply(): guarantees one double per door count
prob_switch_by_doors <- vapply(doors, function(n) {
  mean(replicate(sim_per_door, monty_hall(d = n, switch = TRUE)))
}, numeric(1))

prob_stay_by_doors <- vapply(doors, function(n) {
  mean(replicate(sim_per_door, monty_hall(d = n, switch = FALSE)))
}, numeric(1))

# Long format: one row per (number of doors, strategy) combination
df2 <- data.frame(
  doors = rep(doors, 2),
  probability = c(prob_switch_by_doors, prob_stay_by_doors),
  strategy = rep(c("Switch", "Stay"), each = length(doors))
)
Code
# Plot win probability against the number of doors, annotating the first
# and last door counts with their simulated probabilities.
first_points <- subset(df2, doors == min(df2$doors))
last_points <- subset(df2, doors == max(df2$doors))

p_doors <- ggplot(df2, aes(x = doors, y = probability, color = strategy)) +
  geom_line() +
  geom_point() +
  geom_text(data = first_points,
            aes(label = sprintf("%.3f", probability),
                vjust = ifelse(strategy == "Switch", -0.5, 1.5)),
            hjust = 1.2, show.legend = FALSE) +
  geom_text(data = last_points,
            aes(label = sprintf("%.3f", probability),
                vjust = ifelse(strategy == "Switch", -0.5, 1.5)),
            hjust = -0.2, show.legend = FALSE) +
  theme_minimal() +
  labs(title = "Monty Hall Probabilities with More Doors",
       x = "Number of Doors",
       y = "Probability of Winning",
       color = "Strategy Choice") +
  coord_cartesian(xlim = c(min(doors) - 0.5, max(doors) + 0.5))

p_doors

Extension 2: Number of Open doors

  • Let us see how the probability of winning by switching changes as the number of open doors increases

  • Let there be 50 doors; you select one, and Monty Hall opens 1, 2, …, or 48 of the remaining doors

  • Modify the functions

Code
#' Play one Monty Hall round in which the host opens `open` doors.
#'
#' @param d Number of doors (default 50); must be at least 3.
#' @param open Number of goat doors the host opens; must satisfy
#'   1 <= open <= d - 2 so at least one closed door remains to switch to.
#' @param switch Should the player switch to a remaining closed door?
#' @return TRUE if the player's final choice hides the prize.
monty_hall_v2 <- function(d = 50, open = 1, switch = FALSE) {
  # Fail fast on impossible configurations (the original silently ignored
  # `open` when only one host door was available)
  stopifnot(d >= 3, open >= 1, open <= d - 2)

  # Guard against sample()'s 1:x shortcut for length-one numeric input
  pick_one <- function(x) if (length(x) > 1) sample(x, 1) else x

  prize <- sample(1:d, 1)
  choice <- sample(1:d, 1)

  # Host opens `open` doors, never the player's pick nor the prize.
  # host_choices has d - 1 or d - 2 elements, so length >= open always;
  # when length == open we must return the whole set directly, both for
  # correctness and to dodge the 1:x trap when open == 1.
  host_choices <- setdiff(1:d, c(choice, prize))
  host_open <- if (length(host_choices) > open) {
    sample(host_choices, open)
  } else {
    host_choices
  }

  if (switch) {
    final_choice <- pick_one(setdiff(1:d, c(choice, host_open)))
  } else {
    final_choice <- choice
  }

  final_choice == prize
}
  • simulate the Monty Hall game for different numbers of open doors
Code
open_doors <- 1:48

sim_per_open <- 10000

# Win probabilities as the host opens 1 to 48 of the 50 doors;
# vapply() fixes the return type to one double per setting
prob_switch_by_open <- vapply(open_doors, function(n) {
  mean(replicate(sim_per_open, monty_hall_v2(open = n, switch = TRUE)))
}, numeric(1))

prob_stay_by_open <- vapply(open_doors, function(n) {
  mean(replicate(sim_per_open, monty_hall_v2(open = n, switch = FALSE)))
}, numeric(1))

# Long format for plotting
df3 <- data.frame(
  open_doors = rep(open_doors, 2),
  probability = c(prob_switch_by_open, prob_stay_by_open),
  strategy = rep(c("Switch", "Stay"), each = length(open_doors))
)
  • plot the probabilities of winning by switching and staying
Code
# Plot win probability against the number of doors the host opens,
# annotating the smallest and largest settings with their probabilities.
first_open <- subset(df3, open_doors == min(df3$open_doors))
last_open <- subset(df3, open_doors == max(df3$open_doors))

p_open <- ggplot(df3, aes(x = open_doors, y = probability, color = strategy)) +
  geom_line() +
  geom_point() +
  geom_text(data = first_open,
            aes(label = sprintf("%.3f", probability),
                vjust = ifelse(strategy == "Switch", -0.5, 1.5)),
            hjust = 1.2, show.legend = FALSE, size = 3) +
  geom_text(data = last_open,
            aes(label = sprintf("%.3f", probability),
                vjust = ifelse(strategy == "Switch", -0.5, 1.5)),
            hjust = -0.2, show.legend = FALSE, size = 3) +
  theme_minimal() +
  labs(title = "Monty Hall Probabilities with More Open Doors",
       x = "Number of Open Doors",
       y = "Probability of Winning",
       color = "Strategy Choice") +
  coord_cartesian(xlim = c(min(open_doors) - 2, max(open_doors) + 2))

p_open

Expansion of Walmart

  • Walmart opened its first store in 1962 in Rogers, Arkansas, United States

  • Over the next several decades, it opened many stores within the United States and then around the world

  • Walmart has become one of the largest retail multinational companies in the world

  • This data set contains spatial and temporal information about Walmart store openings from the first opening on March 1, 1962 until August 1, 2006

  • three different types of stores, represented by the variable type

    • Wal-MartStore represents a standard Walmart store
    • SuperCenter is a standard Walmart store as well as a full supermarket
      • they often include pharmacies, garden shops, car service centers, and other specialty centers
    • DistributionCenter - distributes foods and goods to standard Walmart stores and Supercenters

Data set

  • The data set contains the following variables:
    • opendate: Opening date for the store
    • st.address: Street address of the store
    • city: City of the store
    • state: State of the store
    • type: Store type (Wal-MartStore, SuperCenter, DistributionCenter)
    • lat: latitude of the store location
    • long: longitude of the store location
Code
library(tidyverse)

# Download the Walmart store-openings data set
walmart_url <- "https://raw.githubusercontent.com/kwan-MSDA/Bootcamp_2024/refs/heads/main/dataset/walmart.csv"
walmart <- read_csv(walmart_url)

# Relabel the store types for nicer legends and give distribution centers
# a larger plotting size than ordinary stores (size is computed after the
# recode, so it tests against the new label)
walmart <- walmart %>%
  mutate(
    type = recode(type,
                  "DistributionCenter" = "Distribution \ncenter",
                  "SuperCenter" = "Supercenter",
                  "Wal-MartStore" = "Walmart"),
    size = if_else(type == "Distribution \ncenter", 3, 1)
  )

Map the location of Walmart stores

Code
library(maps)

# National map of every store location, coloured by store type; the size
# column (3 for distribution centers, 1 otherwise) is used verbatim.
us_states <- borders(database = "state")

ggplot() +
  us_states +
  geom_point(data = walmart,
             aes(x = long, y = lat, color = type, size = size),
             alpha = 0.6) +
  coord_quickmap() +
  scale_size_identity() +       # take the size values as-is, no scale
  theme_void(base_size = 12) +  # remove all extra formatting
  labs(color = "Type")          # change the label for the legend

Plotting over time

  • The data set contains the opening date of each store, so we can make the maps to show how Walmart expanded over time

  • useful to define a function that subsets the data given a specified date

Code
library(tidyverse)
library(lubridate)

# Map the stores opened on or before `date`.
# `show_legend` toggles a horizontal colour legend below the map;
# the plot title is the year component of `date`.
walmart.map <- function(data, date, show_legend = TRUE) {
  opened <- data %>%
    filter(opendate <= date) %>%
    mutate(size = if_else(type == "Distribution \ncenter", 3, 1))

  p <- ggplot() +
    borders(database = "state") +
    geom_point(data = opened,
               aes(x = long, y = lat, color = type, size = size),
               alpha = 0.6) +
    coord_quickmap() +
    scale_size_identity() +
    theme_void(base_size = 12) +
    labs(color = "Type") +
    ggtitle(year(date))

  if (show_legend) {
    p + theme(legend.position = "bottom", legend.direction = "horizontal") +
      guides(size = "none")  # keep only the colour legend
  } else {
    p + theme(legend.position = "none")
  }
}
  • create a map for every ten years
Code
library(grid)
library(gridExtra)
library(patchwork)

# One snapshot per decade; only the last panel contributes the legend
p1 <- walmart.map(walmart, as.Date("1974-12-31"), show_legend = FALSE)
p2 <- walmart.map(walmart, as.Date("1984-12-31"), show_legend = FALSE)
p3 <- walmart.map(walmart, as.Date("1994-12-31"), show_legend = FALSE)
p4 <- walmart.map(walmart, as.Date("2004-12-31"), show_legend = TRUE)

# Combine plots in a 2x2 grid with a shared legend and main title.
# The layout has two rows, so `heights` takes exactly two values (the
# original passed a third value that patchwork silently ignored).
(p1 + p2) / (p3 + p4) +
  plot_layout(guides = "collect", heights = c(1, 1)) +
  plot_annotation(
    title = "Walmart Store Openings by Year"
  ) &
  theme(
    legend.position = "right",
    legend.direction = "vertical",
    legend.box.background = element_rect(color = "black", linewidth = 0.5),
    legend.box.margin = margin(t = 2, r = 2, b = 2, l = 2), # padding around legend
    plot.title = element_text(hjust = 0, size = 18,
                              face = "bold",
                              margin = margin(b = 10)) # space below title
  )

Animation

  • The gganimate package allows us to create animations within a ggplot framework
    • An animation is essentially a series of static images shown one after the other
    • use the transition_states() function to create a series of plots
  • plot the expansion of Walmart by year
    • create a new variable in the data called `year`
    • within transition_states() function we specify the new year variable as the states argument
  • specify
    • how long we want each state visible with the state_length argument
    • how long we want each transition between states to last with the transition_length argument
    • add the shadow_mark() function to keep the points from previous states visible, adding more points on top
Code
# Select the Cairo PNG graphics device for rendering the animation frames
# (presumably chosen for consistent cross-platform output -- confirm)
options(device = "cairo_png")

library(gifski)
library(gganimate)
library(lubridate)

## Derive the opening year from opendate and make it a factor ordered
## chronologically so transition_states() steps through years in order
walmart <- walmart %>%
  mutate(year = year(opendate),
         year = factor(year, levels = as.character(sort(unique(year)))))

## create the animation: one state per year, previous years kept visible

walmart_animated <-
  ggplot(data = walmart) +
  borders(database = "state") +
  # alpha is a fixed constant, so it belongs OUTSIDE aes(); inside aes()
  # it is mapped through an alpha scale (changing the rendered opacity)
  # and spawns a spurious alpha legend
  geom_point(aes(x = long, y = lat,
                 color = type,
                 size = ifelse(type=="Distribution \ncenter", 2, 1)),
             alpha = 0.6,
             show.legend = c(color = TRUE, size = FALSE)) +
  coord_quickmap() +
  theme_void(base_size=12) +
  transition_states(states = year,
                    transition_length = 0, # Instant transition
                    state_length = 1) +   # 1 second per year
  shadow_mark() +                          # keep earlier years' points
  labs(title = "Year: {closest_state}") +
  scale_color_manual(values = c("Supercenter" = "blue", 
                              "Walmart" = "green", 
                              "Distribution \ncenter" = "red"
                              ),
                     name= "Type")

Animate the plot

Code
## printing the gganimate object triggers rendering of the animation
walmart_animated

Code
## save the animation to a GIF file at 1920x1080
anim_save(filename="walmart.gif", 
          animation = walmart_animated, 
          width = 1920, height = 1080)

Racial Segregation

Racial Segregation

  • In 1969, Thomas C. Schelling developed a simple but striking model of racial segregation

    • Schelling was awarded the 2005 Nobel Prize in Economic Sciences (joint with Robert Aumann) for his work on segregation (and other research)
  • the model shows how local interactions can lead to surprising aggregate outcomes

  • Each household has relatively mild preference for neighbors of the same race

  • the model predicts strongly divided neighborhoods, with high levels of segregation

  • extreme segregation outcomes arise even though people’s preferences are not particularly extreme

Set-up of the model

  • Suppose we have two types of people: white people and black people

  • Assume there are \(n\) of each type

  • These people all live on a single unit square

    • the location of an individual is just a point \((x,y)\), where \(0 < x, y < 1\)
    • the set of all points \((x, y)\) satisfying \(0 < x, y < 1\) is called the unit square (denoted by \(S\))
  • an individual is happy if 5 or more of her 10 nearest neighbors are of the same type

    • otherwise, she is unhappy
  • Initially, individuals are mixed together (integrated)

    • each individual is now given the chance to stay or move
    • each individual stays if they are happy and moves if they are unhappy

Simulating the model

  • cycling through the set of all agents, each agent is now given the chance to stay or move
    • continue to cycle until no one wishes to move
Code
library(reticulate)
# NOTE(review): machine-specific absolute path -- this only works on the
# author's Windows machine; other users must point use_python() at their
# own interpreter (or set RETICULATE_PYTHON)
use_python("C:/Users/ktwong/AppData/Local/Programs/Python/Python312", required = TRUE)
Code
import matplotlib.pyplot as plt
from random import uniform, seed
from math import sqrt
import numpy as np

class Agent:
    """A Schelling-model agent with a type and a random location in the
    unit square."""

    def __init__(self, type):
        self.type = type
        self.draw_location()

    def draw_location(self):
        """Place the agent uniformly at random in the unit square."""
        self.location = uniform(0, 1), uniform(0, 1)

    def get_distance(self, other):
        "Computes the euclidean distance between self and other agent."
        a = (self.location[0] - other.location[0])**2
        b = (self.location[1] - other.location[1])**2
        return sqrt(a + b)

    def happy(self,
                agents,                # List of other agents
                num_neighbors=10,      # No. of agents viewed as neighbors
                require_same_type=3):  # How many neighbors must be same type
        """
            True if a sufficient number of nearest neighbors are of the same
            type.
        """

        distances = []

        # Distances is a list of pairs (d, agent), where d is distance from
        # agent to self
        for agent in agents:
            if self != agent:
                distance = self.get_distance(agent)
                distances.append((distance, agent))

        # Sort by distance ONLY: sorting the bare tuples would compare the
        # Agent objects whenever two distances tie and raise a TypeError,
        # because Agent defines no ordering
        distances.sort(key=lambda pair: pair[0])

        # Extract the neighboring agents
        neighbors = [agent for d, agent in distances[:num_neighbors]]

        # Count how many neighbors have the same type as self
        num_same_type = sum(self.type == agent.type for agent in neighbors)
        return num_same_type >= require_same_type

    def update(self, agents, num_neighbors=10, require_same_type=3):
        "If not happy, then randomly choose new locations until happy."
        while not self.happy(agents, num_neighbors, require_same_type):
            self.draw_location()
  • write a plotting function with white individuals represented by orange dots and black ones represented by green dots
Code
def collect_distribution(agents):
    """Collect x, y coordinates for each agent type."""
    # One (xs, ys) pair per type: index 0 for type-0 agents, 1 otherwise
    buckets = {0: ([], []), 1: ([], [])}
    for agent in agents:
        xs, ys = buckets[0] if agent.type == 0 else buckets[1]
        x, y = agent.location
        xs.append(x)
        ys.append(y)
    return buckets[0][0], buckets[0][1], buckets[1][0], buckets[1][1]


def plot_side_by_side(initial_data, final_data, cycle_num):
    """Plot initial and final distributions side by side."""
    fig, axes = plt.subplots(1, 2, figsize=(10, 5))
    plot_args = {'markersize': 8, 'alpha': 0.6}
    titles = ('Initial Distribution (Cycle 0)',
              f'Final Distribution (Cycle {cycle_num-1})')

    # Same styling for both panels: type 0 in orange, type 1 in green
    for ax, data, title in zip(axes, (initial_data, final_data), titles):
        ax.set_facecolor('azure')
        ax.plot(data[0], data[1], 'o', markerfacecolor='orange', **plot_args)
        ax.plot(data[2], data[3], 'o', markerfacecolor='green', **plot_args)
        ax.set_title(title)
        ax.set_xlabel('X')
        ax.set_ylabel('Y')

    plt.tight_layout()
    plt.show()
  • Now we can run the simulation
Code
def run_simulation(num_of_type_0=1000,
                   num_of_type_1=1000,
                   max_iter=100000,       # Maximum number of iterations
                   set_seed=123,
                   num_neighbors=10,
                   require_same_type=5):
    """Run the Schelling segregation model and plot the initial and final
    agent distributions side by side.

    Agents of two types are scattered in the unit square; each cycle every
    unhappy agent relocates until happy.  The loop stops when a full cycle
    produces no moves (convergence) or after max_iter cycles.
    """

    # Set the seed for reproducibility
    seed(set_seed)

    # Create a list of agents of type 0, then append the type-1 agents
    agents = [Agent(0) for _ in range(num_of_type_0)]
    agents.extend(Agent(1) for _ in range(num_of_type_1))

    # Snapshot of the starting (integrated) configuration
    initial_data = collect_distribution(agents)

    # `count` numbers the cycle about to run; after the loop it is one past
    # the last completed cycle, which is exactly what plot_side_by_side
    # expects for its "Cycle {cycle_num-1}" title.
    count = 1
    converged = False

    # Loop until no agent wishes to move
    while count < max_iter:
        print(f'Entering loop {count}')
        no_one_moved = True
        for agent in agents:
            old_location = agent.location
            agent.update(agents, num_neighbors, require_same_type)
            if agent.location != old_location:
                no_one_moved = False
        count += 1
        if no_one_moved:
            converged = True
            break

    # Snapshot of the final (segregated) configuration
    final_data = collect_distribution(agents)

    # Plot initial and final distributions side by side
    plot_side_by_side(initial_data, final_data, count)

    if converged:
        # count - 1 completed cycles; the original reported `count` here,
        # over-counting by one ("Converged after 8" after only 7 cycles)
        print(f'Converged after {count - 1} iterations.')
    else:
        print('Hit iteration bound and terminated.')
        
        
  • try running the simulation with different numbers of agents
Code
# Run the simulation with example parameters:
# 1000 agents of each type; an agent is happy when at least 5 of its 10
# nearest neighbors share its type
run_simulation(
    num_of_type_0=1000,
    num_of_type_1=1000,
    max_iter=100000,
    set_seed=123,
    num_neighbors=10,
    require_same_type=5  # Example: require 5 neighbors of the same type
)
Entering loop 1
Entering loop 2
Entering loop 3
Entering loop 4
Entering loop 5
Entering loop 6
Entering loop 7
Converged after 8 iterations.

O-ring theory

O-ring theory

  • The O-ring theory of economic development was proposed in 1993 by Michael Kremer, who was awarded the 2019 Nobel Prize in Economic Sciences for his work on economic growth and development (along with Esther Duflo and Abhijit Banerjee)
  • The theory is based on the idea that production is multiplicative across tasks, so it requires high-quality inputs working together
    • The theory is named after the O-ring, a critical component in the Space Shuttle Challenger that failed in 1986, leading to the shuttle’s explosion shortly after launch
  • firms combine inputs (e.g., workers’ skills) in a way that the failure or lower quality of one input significantly reduces output
    • The theory suggests that countries with low average skill levels may not be able to produce high-quality goods, as the risk of failure in one task can lead to overall poor performance
    • O-rings in a spaceship where one weak link can cause catastrophic failure
    • this leads to assortative matching, where high-skill workers cluster together, amplifying income disparities

Simulating the O-ring theory

  • models workers with slightly skewed ability distributions

    • assigns them to firms based on skill levels
    • calculates output using an O-ring production function
  • Goal: The simulation shows how even modest ability differences can lead to significant income inequality

  • Some setting

Code
import numpy as np
import matplotlib.pyplot as plt

# Parameters for the O-ring simulation
n_workers = 1000  # Number of workers
n_firms = 100     # Number of firms

# Workers per firm via integer division; the reshape step later assumes
# n_workers is an exact multiple of n_firms
workers_per_firm = n_workers // n_firms

alpha = 0.8       # O-ring production function parameter (0 < alpha < 1)
sigma = 0.2       # Standard deviation for ability distribution (small skew)

Key Steps

  • Step 1: Generate worker abilities (log-normal for slight skew)
Code
# Fix the RNG state so the simulation is reproducible
np.random.seed(42)
# Log-normal abilities: median exp(0) = 1 with a mild right skew (sigma = 0.2)
abilities = np.random.lognormal(mean=0, sigma=sigma, size=n_workers)
  • Step 2: Sort workers by ability and assign to firms (assortative matching)
Code
sorted_indices = np.argsort(abilities)[::-1]  # Sort in descending order
# Abilities reordered from most to least able worker
sorted_abilities = abilities[sorted_indices]
  • Assign workers to firms: top workers to firm 1, next to firm 2, etc.
Code
# Row i holds firm i's workers: the reshape hands the top workers_per_firm
# workers to firm 0, the next block to firm 1, and so on (requires
# n_workers == n_firms * workers_per_firm, else reshape raises)
firm_abilities = sorted_abilities.reshape(n_firms, workers_per_firm)

Key Steps (continued)

  • Step 3: O-ring production function
    • let output be \(Y\)
    • Notice that \(Y = A \left(\prod_{i=1}^{n} a_i\right)^{\alpha}\), where \(A\) is a constant (set to 1) and \(a_i\) is the ability of worker \(i\)
Code
# O-ring output per firm: Y = (product of member abilities) ** alpha,
# with the constant A set to 1
firm_outputs = np.prod(firm_abilities, axis=1) ** alpha
  • Step 4: Assume wages are proportional to firm output
    • Each worker in a firm earns a wage proportional to the firm’s output
Code

# Each worker earns an equal share of their firm's output
wages = np.repeat(firm_outputs, workers_per_firm) / workers_per_firm
  • Step 5: Calculate income inequality (Gini coefficient)
Code

def gini_coefficient(incomes):
    """Exact Gini coefficient of a 1-D array of non-negative incomes.

    Uses the closed form
        G = 2 * sum(i * x_i) / (n * sum(x)) - (n + 1) / n
    with x sorted ascending and ranks i = 1..n.  The previous
    right-Riemann approximation of the Lorenz-curve area was biased by
    -1/n: it returned a slightly negative value even for perfectly equal
    incomes, where the Gini should be exactly 0.
    """
    sorted_incomes = np.sort(incomes)
    n = len(sorted_incomes)
    ranks = np.arange(1, n + 1)
    total = sorted_incomes.sum()
    return (2 * np.sum(ranks * sorted_incomes)) / (n * total) - (n + 1) / n

# Gini coefficient of the simulated wage distribution
gini = gini_coefficient(wages)

Step 6: Plot the distribution of wages and display the Gini coefficient

Code
import seaborn as sns
import matplotlib.pyplot as plt


# Side-by-side histograms: worker abilities (left) and wages (right).
# NOTE: the rcParams assignments below mutate process-global state, and
# the pyplot calls rely on the implicit "current axes", so statement
# order matters throughout this script.
# Set a modern style
sns.set_style("whitegrid")  # Adds a clean grid background
plt.rcParams['font.family'] = 'Arial'  # Use a clean, professional font
plt.rcParams['axes.titlesize'] = 14  # Increase title font size
plt.rcParams['axes.labelsize'] = 12  # Increase label font size
plt.rcParams['xtick.labelsize'] = 10  # Adjust tick label size
plt.rcParams['ytick.labelsize'] = 10

# Create figure with a larger size for better visibility
plt.figure(figsize=(14, 6), facecolor='white')

# Plot ability distribution (left panel)
plt.subplot(1, 2, 1)
sns.histplot(abilities, bins=50, kde=True, color='#1f77b4', alpha=0.7, stat='density', 
             edgecolor='black', linewidth=0.5)
plt.title(f'Ability Distribution\n(Mean={np.mean(abilities):.2f}, Std={np.std(abilities):.2f})', 
          fontsize=14, fontweight='bold', pad=15)
plt.xlabel('Ability', fontsize=12)
plt.ylabel('Density', fontsize=12)

# Add mean line
plt.axvline(np.mean(abilities), color='red', linestyle='--', linewidth=2, label='Mean')
plt.legend()

# Plot wage distribution (right panel), annotated with the Gini coefficient
plt.subplot(1, 2, 2)
sns.histplot(wages, bins=50, kde=True, color='#2ca02c', alpha=0.7, stat='density', 
             edgecolor='black', linewidth=0.5)
plt.title(f'Wage Distribution\n(Gini Coefficient={gini:.3f})', 
          fontsize=14, fontweight='bold', pad=15)
plt.xlabel('Wage', fontsize=12)
plt.ylabel('Density', fontsize=12)

# Add median line
plt.axvline(np.median(wages), color='purple', linestyle='--', linewidth=2, label='Median')
plt.legend()

# Adjust layout and add a subtle background color
# (suptitle comes after tight_layout with y=1.05, so it sits above the axes)
plt.tight_layout()
plt.suptitle('Distribution Analysis', fontsize=16, fontweight='bold', y=1.05)
plt.gcf().set_facecolor('#f5f5f5')  # Light gray background for the figure

# Show plot
plt.show()

Code
# Print summary statistics for the ability draw (log-normal, so the mean
# sits slightly above the median of 1)
print(f"Ability: Mean={np.mean(abilities):.2f}, Std={np.std(abilities):.2f}")
Ability: Mean=1.02, Std=0.20
Code
print(f"Wages: Mean={np.mean(wages):.2f}, Std={np.std(wages):.2f}")
Wages: Mean=0.36, Std=0.91
Code
print(f"Gini Coefficient: {gini:.3f}")
Gini Coefficient: 0.738

Social Network

Code
library(tidyverse)
## load the data: the follow edge list and the senator attribute table
follow_url <- "https://raw.githubusercontent.com/kwan-MSDA/FOSS/refs/heads/main/twitter-following.csv"
senator_url <- "https://raw.githubusercontent.com/kwan-MSDA/FOSS/refs/heads/main/twitter-senator.csv"

follow <- read_csv(follow_url)

senator <- read_csv(senator_url)

## examine the first rows of the edge list
head(follow)
# A tibble: 6 × 2
  following    followed       
  <chr>        <chr>          
1 SenAlexander RoyBlunt       
2 SenAlexander SenatorBurr    
3 SenAlexander JohnBoozman    
4 SenAlexander SenJohnBarrasso
5 SenAlexander SenBennetCO    
6 SenAlexander SenDanCoats    
Code
## first rows of the senator attribute table (screen name, full name,
## party, state)
head(senator)
# A tibble: 6 × 4
  screen_name     name            party state
  <chr>           <chr>           <chr> <chr>
1 SenAlexander    Lamar Alexander R     TN   
2 RoyBlunt        Roy Blunt       R     MO   
3 SenatorBoxer    Barbara Boxer   D     CA   
4 SenSherrodBrown Sherrod Brown   D     OH   
5 SenatorBurr     Richard Burr    R     NC   
6 SenatorBaldwin  Tammy Baldwin   D     WI   

Building the Twitter Network

Code
library(igraph)

# Build a directed graph from the follow edge list.  Vertices are created
# in order of first appearance in `follow`, which need NOT match the row
# order of `senator`.
twitter_adj <- graph_from_edgelist(as.matrix(follow),
                                   directed = TRUE)

# degree() returns a vector named by vertex (screen name); index it by
# each senator's screen name so the counts line up with the rows --
# assigning the raw vector positionally would attach the wrong counts
# whenever the two orderings differ.
senator <- mutate(senator,
                  indegree = unname(degree(twitter_adj, mode = "in")[screen_name]),
                  outdegree = unname(degree(twitter_adj, mode = "out")[screen_name]))
Code
## Top three senators by number of followers within the network
senator %>%
  arrange(desc(indegree)) %>%
  head(3) %>%
  select(name, party, state, indegree, outdegree)
# A tibble: 3 × 5
  name              party state indegree outdegree
  <chr>             <chr> <chr>    <dbl>     <dbl>
1 Tom Cotton        R     AR          64        15
2 Richard J. Durbin D     IL          60        87
3 John Barrasso     R     WY          58        79
Code
## with slice_max: note that ties in `indegree` are all kept, so more
## than n = 3 rows can come back (here five, because three senators tie
## at indegree 58)
slice_max(senator, order_by = indegree, n = 3) %>%
  arrange(desc(indegree)) %>%
  select(name, party, state, indegree, outdegree)
# A tibble: 5 × 5
  name              party state indegree outdegree
  <chr>             <chr> <chr>    <dbl>     <dbl>
1 Tom Cotton        R     AR          64        15
2 Richard J. Durbin D     IL          60        87
3 John Barrasso     R     WY          58        79
4 Joe Donnelly      D     IN          58         9
5 Orrin G. Hatch    R     UT          58        50

Some key measures for network

  • define scales to reuse for the plots
Code
# Shared colour and shape scales for the party variable, reused by the
# centrality plots below.  The label lookup is defined once so the two
# scales cannot drift apart.
party_labels <- c(R = "Republican", D = "Democrat", I = "Independent")

scale_color_parties <- scale_color_manual("Party",
                                          values = c(R = "red",
                                                     D = "blue",
                                                     I = "green"),
                                          labels = party_labels)

scale_shape_parties <- scale_shape_manual("Party",
                                          values = c(R = 16,
                                                     D = 17,
                                                     I = 4),
                                          labels = party_labels)
  • Calculate closeness measures and graph
Code
# Closeness centrality over incoming vs. outgoing follow paths.
# Two fixes: (1) centrality vectors are indexed by screen name so each
# senator row gets its own value regardless of vertex order; (2) labs()
# has no `main` argument (that is base graphics) -- use `title` so the
# heading actually renders.
senator %>%
  mutate(closeness_in = unname(closeness(twitter_adj,
                                         mode = "in")[screen_name]),
         closeness_out = unname(closeness(twitter_adj,
                                          mode = "out")[screen_name])) %>%
  ggplot(aes(x = closeness_in, y = closeness_out,
             color = party, shape = party)) +
  geom_point() +
  scale_color_parties +
  scale_shape_parties +
  labs(title = "Closeness", x = "Incoming path",
       y = "Outgoing path") +
  theme_classic(base_size = 22) +
  theme(legend.position = "none")

  • Calculate betweenness measures and graph
Code
# Betweenness centrality treating the follow ties as directed vs.
# undirected.  As above: index by screen name so values line up with the
# senator rows, and use labs(title =) -- labs() silently ignores `main`.
senator %>%
  mutate(betweenness_dir = unname(betweenness(twitter_adj,
                                              directed = TRUE)[screen_name]),
         betweenness_undir = unname(betweenness(twitter_adj,
                                                directed = FALSE)[screen_name])) %>%
  ggplot(aes(x = betweenness_dir,
             y = betweenness_undir, color = party,
             shape = party)) +
  geom_point() +
  scale_color_parties +
  scale_shape_parties +
  labs(title = "Betweenness", x = "Directed", y = "Undirected") +
  theme_classic(base_size = 22)

  • Calculate the pagerank
Code
# PageRank: page_rank() returns a vector named by vertex; index by screen
# name so each senator row receives its own score (positional assignment
# is only correct if the vertex order happens to match the row order)
senator <- mutate(senator,
                  page_rank = unname(page_rank(twitter_adj)[["vector"]][screen_name]))

## Create igraph object whose vertex attributes come from `senator`
net <- graph_from_data_frame(d = follow,
                             vertices = senator,
                             directed = TRUE)  # spell out TRUE, not T
## View the new object
net
IGRAPH 34e4368 DN-- 91 3859 -- 
+ attr: name (v/c), party (v/c), state (v/c), indegree (v/n), outdegree
| (v/n), page_rank (v/n)
+ edges from 34e4368 (vertex names):
 [1] Lamar Alexander->Roy Blunt         Lamar Alexander->Richard Burr     
 [3] Lamar Alexander->John Boozman      Lamar Alexander->John Barrasso    
 [5] Lamar Alexander->Michael F. Bennet Lamar Alexander->Daniel Coats     
 [7] Lamar Alexander->Susan M. Collins  Lamar Alexander->John Cornyn      
 [9] Lamar Alexander->Bob Corker        Lamar Alexander->Michael B. Enzi  
[11] Lamar Alexander->Joni Ernst        Lamar Alexander->Chuck Grassley   
[13] Lamar Alexander->Cory Gardner      Lamar Alexander->Orrin G. Hatch   
+ ... omitted several edges
Code
## look at some network edges, nodes, and node (vertex) attributes
head(E(net)) ## E() for edges -- returns the graph's edge sequence
+ 6/3859 edges from 34e4368 (vertex names):
[1] Lamar Alexander->Roy Blunt         Lamar Alexander->Richard Burr     
[3] Lamar Alexander->John Boozman      Lamar Alexander->John Barrasso    
[5] Lamar Alexander->Michael F. Bennet Lamar Alexander->Daniel Coats     
Code
head(V(net)) ## V() for vertex -- returns the graph's vertex sequence
+ 6/91 vertices, named, from 34e4368:
[1] Lamar Alexander Roy Blunt       Barbara Boxer   Sherrod Brown  
[5] Richard Burr    Tammy Baldwin  
Code
head(V(net)$party) ## vertex attributes are accessed with $ on V()
[1] "R" "R" "D" "D" "R" "D"
Code
## Create igraph object from the follow edge list, attaching the
## senator data frame as vertex attributes.
## FIX: spell out TRUE rather than the reassignable shorthand T.
net <- graph_from_data_frame(d = follow,
                             vertices = senator,
                             directed = TRUE)
## View the new object (prints a summary of vertices, edges, attributes)
net
IGRAPH 34e9b0f DN-- 91 3859 -- 
+ attr: name (v/c), party (v/c), state (v/c), indegree (v/n), outdegree
| (v/n), page_rank (v/n)
+ edges from 34e9b0f (vertex names):
 [1] Lamar Alexander->Roy Blunt         Lamar Alexander->Richard Burr     
 [3] Lamar Alexander->John Boozman      Lamar Alexander->John Barrasso    
 [5] Lamar Alexander->Michael F. Bennet Lamar Alexander->Daniel Coats     
 [7] Lamar Alexander->Susan M. Collins  Lamar Alexander->John Cornyn      
 [9] Lamar Alexander->Bob Corker        Lamar Alexander->Michael B. Enzi  
[11] Lamar Alexander->Joni Ernst        Lamar Alexander->Chuck Grassley   
[13] Lamar Alexander->Cory Gardner      Lamar Alexander->Orrin G. Hatch   
+ ... omitted several edges
Code
## look at some network edges, nodes, and node (vertex) attributes
head(E(net)) ## E() for edges -- returns the graph's edge sequence
+ 6/3859 edges from 34e9b0f (vertex names):
[1] Lamar Alexander->Roy Blunt         Lamar Alexander->Richard Burr     
[3] Lamar Alexander->John Boozman      Lamar Alexander->John Barrasso    
[5] Lamar Alexander->Michael F. Bennet Lamar Alexander->Daniel Coats     
Code
head(V(net)) ## V() for vertex -- returns the graph's vertex sequence
+ 6/91 vertices, named, from 34e9b0f:
[1] Lamar Alexander Roy Blunt       Barbara Boxer   Sherrod Brown  
[5] Richard Burr    Tammy Baldwin  
Code
head(V(net)$party) ## vertex attributes are accessed with $ on V()
[1] "R" "R" "D" "D" "R" "D"
Code
# Identify top 5 senators by PageRank for each party.
# FIX: top_n() is superseded in current dplyr; slice_max() is the
# modern equivalent (ties are kept by default, matching top_n()).
# Also ungroup() after group_by() so the result is a plain tibble.
top5_by_party <- senator %>%
  group_by(party) %>%
  slice_max(page_rank, n = 5) %>%
  ungroup() %>%
  arrange(party, desc(page_rank)) %>%
  select(name, party, page_rank)  # 'name' holds the senator names

# Labels default to NA so only the top-ranked senators get annotated
V(net)$label <- NA

# FIX: replace the original index loop (which used the unsafe
# 1:nrow() pattern -- yields c(1, 0) when nrow is 0) with a
# vectorized match() of top-5 names against vertex names.
# Names absent from the network are simply skipped, as before.
idx <- match(top5_by_party$name, V(net)$name)
found <- !is.na(idx)
V(net)$label[idx[found]] <- top5_by_party$name[found]

## Vector of node colors based on party affiliation
## (pull(col) replaces the redundant select() %>% pull() pair)
col <- senator %>%
  mutate(col = case_when(party == "R" ~ "red",
                         party == "D" ~ "blue",
                         TRUE ~ "black")) %>%
  pull(col)

# Plot the network, sizing nodes by PageRank and labeling only the
# top 5 senators per party.
# NOTE(review): current igraph's layout_with_kk() takes `maxiter`,
# not `niter` -- confirm this argument against the installed version.
plot(net,
     vertex.size = V(net)$page_rank * 1000,
     vertex.label = V(net)$label,  # NA labels are not drawn
     vertex.label.cex = 0.8,       # label size
     vertex.label.color = "black", # label color
     vertex.color = col,
     edge.arrow.size = 0.1,
     edge.width = 0.5,
     main = "Twitter Network of US Senators (Top 5 PageRank Labeled)",
     layout = layout_with_kk(net, niter = 1000))

# Add a legend mapping node colors to parties
legend("topright",  # position: also "topleft", "bottomright", etc.
       legend = c("Republican", "Democrat", "Other"),
       col = c("red", "blue", "black"),
       pch = 19,  # solid circle symbol matching the nodes
       bty = "n", # no box around the legend
       cex = 0.8) # legend text size